- 群間で差異のある分子群および生物学的な機能など調べる一連の解析
home = "~/hgc2019" setwd(home)
files = list.files("data/GSE63310_RAW",full.names = TRUE)
files[1:3]
## [1] "data/GSE63310_RAW/GSM1545535_10_6_5_11.txt.gz" ## [2] "data/GSE63310_RAW/GSM1545536_9_6_5_11.txt.gz" ## [3] "data/GSE63310_RAW/GSM1545537_mo906111-1_m09611-2.txt.gz"
selectfiles =
!(files %in%
c("data/GSE63310_RAW/GSM1545537_mo906111-1_m09611-2.txt.gz",
"data/GSE63310_RAW/GSM1545543_JMS9-CDBG.txt.gz"))
files = files[selectfiles]
library(limma) library(edgeR) x = readDGE(files,columns = c(1,3))
class(x) # オブジェクトの型
## [1] "DGEList" ## attr(,"package") ## [1] "edgeR"
names(x) # 要素名
## [1] "samples" "counts"
class(x$samples)
## [1] "data.frame"
class(x$counts)
## [1] "matrix"
dim(x$samples)
## [1] 9 4
dim(x$counts)
## [1] 27179 9
head(x$samples) head(x$counts)
dim(x$samples)
## [1] 9 4
colnames(x$samples)
## [1] "files" "group" "lib.size" "norm.factors"
x$samples$files[1:5]
## [1] "data/GSE63310_RAW/GSM1545535_10_6_5_11.txt.gz" ## [2] "data/GSE63310_RAW/GSM1545536_9_6_5_11.txt.gz" ## [3] "data/GSE63310_RAW/GSM1545538_purep53.txt.gz" ## [4] "data/GSE63310_RAW/GSM1545539_JMS8-2.txt.gz" ## [5] "data/GSE63310_RAW/GSM1545540_JMS8-3.txt.gz"
x$samples$group[1:5]
## [1] 1 1 1 1 1 ## Levels: 1
x$samples$lib.size[1:5]
## [1] 32863052 35335491 57160817 51368625 75795034
x$samples$norm.factors[1:5]
## [1] 1 1 1 1 1
colnames(x)[1:2] #Before
## [1] "data/GSE63310_RAW/GSM1545535_10_6_5_11.txt" ## [2] "data/GSE63310_RAW/GSM1545536_9_6_5_11.txt"
samplenames = basename(colnames(x)) #ファイルパスからファイル名のみ抽出
samplenames = gsub("GSM|.txt","",samplenames) # "|"はORという意味
samplenames[1:2] #After
## [1] "1545535_10_6_5_11" "1545536_9_6_5_11"
colnames(x) = samplenames colnames(x)[1:5]
## [1] "1545535_10_6_5_11" "1545536_9_6_5_11" "1545538_purep53" ## [4] "1545539_JMS8-2" "1545540_JMS8-3"
rownames(x$samples)[1:5]
## [1] "1545535_10_6_5_11" "1545536_9_6_5_11" "1545538_purep53" ## [4] "1545539_JMS8-2" "1545540_JMS8-3"
group <- as.factor(c("LP", "ML", "Basal",
"Basal", "ML", "LP", "Basal", "ML", "LP"))
x$samples$group = group
x$samples$group
## [1] LP ML Basal Basal ML LP Basal ML LP ## Levels: Basal LP ML
x$samples$group = group
lane = as.factor(rep(c("L004","L006","L008"), c(3,4,2)))
x$samples$lane = lane
x$samples
遺伝子アノテーション
library(Mus.musculus)
geneid = rownames(x)
genes = select(Mus.musculus, keys=geneid,columns=c("SYMBOL", "TXCHROM"),
keytype="ENTREZID")
dim(genes)
## [1] 27206 3
head(genes,n=2)
## ENTREZID SYMBOL TXCHROM ## 1 497097 Xkr4 chr1 ## 2 100503874 Gm19938 <NA>
columns(Mus.musculus)
## [1] "ACCNUM" "ALIAS" "CDSCHROM" "CDSEND" "CDSID" ## [6] "CDSNAME" "CDSPHASE" "CDSSTART" "CDSSTRAND" "DEFINITION" ## [11] "ENSEMBL" "ENSEMBLPROT" "ENSEMBLTRANS" "ENTREZID" "ENZYME" ## [16] "EVIDENCE" "EVIDENCEALL" "EXONCHROM" "EXONEND" "EXONID" ## [21] "EXONNAME" "EXONRANK" "EXONSTART" "EXONSTRAND" "GENEID" ## [26] "GENENAME" "GO" "GOALL" "GOID" "IPI" ## [31] "MGI" "ONTOLOGY" "ONTOLOGYALL" "PATH" "PFAM" ## [36] "PMID" "PROSITE" "REFSEQ" "SYMBOL" "TERM" ## [41] "TXCHROM" "TXEND" "TXID" "TXNAME" "TXSTART" ## [46] "TXSTRAND" "TXTYPE" "UNIGENE" "UNIPROT"
tb = table(genes$ENTREZID) dupId = names(tb[tb > 1]) genes[genes$ENTREZID %in% dupId[5:6],]
## ENTREZID SYMBOL TXCHROM ## 11545 100217457 Snord58b chr14 ## 11546 100217457 Snord58b chr18 ## 16094 100042555 Gm13305 chr4 ## 16095 100042555 Gm13305 chr4_JH584293_random ## 16096 100042555 Gm13305 chr4_JH584294_random
duplicated()関数は重複している一つ目以外の要素をTRUE/FALSEで返すg = rep(LETTERS[1:3],each=3) duplicated(g)
## [1] FALSE TRUE TRUE FALSE TRUE TRUE FALSE TRUE TRUE
g[duplicated(g)]
## [1] "A" "A" "B" "B" "C" "C"
g[!duplicated(g)]
## [1] "A" "B" "C"
duplicated()関数を用いて重複を機械的に除去genes = genes[!duplicated(genes$ENTREZID),]
xにgenesという変数を追加match(ref,target)はrefの各要素がtargetの要素に対応する添字を返す関数## Samples ## Tags 1545535_10_6_5_11 1545536_9_6_5_11 1545538_purep53 ## 497097 1 2 342 ## 100503874 0 0 5 ## 100038431 0 0 0
x$samples$lib.size
## [1] 32863052 35335491 57160817 51368625 75795034 60517657 55086324 21311068 ## [9] 19958838
cpm = cpm(x) # CPM without logarithm calculation
1)lcpm = cpm(x, log=TRUE) # log2CPM
voom関数でも内部でlog2変換を自動で行っているtable(rowSums(x$counts==0) == 9)
## ## FALSE TRUE ## 22026 5153
keep.exprs = filterByExpr(x, group=group) x = x[keep.exprs,, keep.lib.sizes=FALSE] dim(x)
## [1] 16624 9
filterByExpr()関数を用いるkeep.lib.size=FALSEはライブラリサイズの再計算をする否か (なくても影響ない)xのx$norm.factorが初期値から自動更新されるx = calcNormFactors(x,method="TMM") x$samples$norm.factors
## [1] 0.8943956 1.0250186 1.0459005 1.0458455 1.0162707 0.9217132 0.9961959 ## [8] 1.0861026 0.9839203
cpm = cpm(x) #補正後のCPM lcpm = cpm(x,log=TRUE)
lcpm = cpm(x,log=TRUE) #フィルタリングCPMの再計算 label = x$samples$group laneCol = as.numeric(x$samples$lane) plotMDS(lcpm,labels=label,col=laneCol,dim.plot = c(1,2))
plotMDS(lcpm,labels=label,col=laneCol,dim.plot = c(3,4))
測定誤差の影響を出来るだけ除去したい測定誤差と生物学的ばらつきに分解(Law et al. 2014)
生物学的ばらつきはリード数に依存せず一定測定誤差のみがリード数に依存して変化するgroup = x$samples$group
lane = x$samples$lane
design <- model.matrix(~0+group+lane)
colnames(design) <- gsub("group", "", colnames(design))
design
## Basal LP ML laneL006 laneL008 ## 1 0 1 0 0 0 ## 2 0 0 1 0 0 ## 3 1 0 0 0 0 ## 4 1 0 0 1 0 ## 5 0 0 1 1 0 ## 6 0 1 0 1 0 ## 7 1 0 0 1 0 ## 8 0 0 1 0 1 ## 9 0 1 0 0 1 ## attr(,"assign") ## [1] 1 1 1 2 2 ## attr(,"contrasts") ## attr(,"contrasts")$group ## [1] "contr.treatment" ## ## attr(,"contrasts")$lane ## [1] "contr.treatment"
contr.matrix = makeContrasts( BasalvsLP = Basal-LP, BasalvsML = Basal - ML, LPvsML = LP - ML, levels = colnames(design)) contr.matrix
## Contrasts ## Levels BasalvsLP BasalvsML LPvsML ## Basal 1 1 0 ## LP -1 0 1 ## ML 0 -1 -1 ## laneL006 0 0 0 ## laneL008 0 0 0
v = voom(x, design, plot=TRUE)
v
## An object of class "EList" ## $genes ## ENTREZID SYMBOL TXCHROM ## 1 497097 Xkr4 chr1 ## 5 20671 Sox17 chr1 ## 6 27395 Mrpl15 chr1 ## 7 18777 Lypla1 chr1 ## 9 21399 Tcea1 chr1 ## 16619 more rows ... ## ## $targets ## files group lib.size ## 1545535_10_6_5_11 data/GSE63310_RAW/GSM1545535_10_6_5_11.txt.gz LP 29387429 ## 1545536_9_6_5_11 data/GSE63310_RAW/GSM1545536_9_6_5_11.txt.gz ML 36212498 ## 1545538_purep53 data/GSE63310_RAW/GSM1545538_purep53.txt.gz Basal 59771061 ## 1545539_JMS8-2 data/GSE63310_RAW/GSM1545539_JMS8-2.txt.gz Basal 53711278 ## 1545540_JMS8-3 data/GSE63310_RAW/GSM1545540_JMS8-3.txt.gz ML 77015912 ## 1545541_JMS8-4 data/GSE63310_RAW/GSM1545541_JMS8-4.txt.gz LP 55769890 ## 1545542_JMS8-5 data/GSE63310_RAW/GSM1545542_JMS8-5.txt.gz Basal 54863512 ## 1545544_JMS9-P7c data/GSE63310_RAW/GSM1545544_JMS9-P7c.txt.gz ML 23139691 ## 1545545_JMS9-P8c data/GSE63310_RAW/GSM1545545_JMS9-P8c.txt.gz LP 19634459 ## norm.factors lane ## 1545535_10_6_5_11 0.8943956 L004 ## 1545536_9_6_5_11 1.0250186 L004 ## 1545538_purep53 1.0459005 L004 ## 1545539_JMS8-2 1.0458455 L006 ## 1545540_JMS8-3 1.0162707 L006 ## 1545541_JMS8-4 0.9217132 L006 ## 1545542_JMS8-5 0.9961959 L006 ## 1545544_JMS9-P7c 1.0861026 L008 ## 1545545_JMS9-P8c 0.9839203 L008 ## ## $E ## Samples ## Tags 1545535_10_6_5_11 1545536_9_6_5_11 1545538_purep53 1545539_JMS8-2 ## 497097 -4.292165 -3.856488 2.5185849 3.2931366 ## 20671 -4.292165 -4.593453 0.3560126 -0.4073032 ## 27395 3.876089 4.413107 4.5170045 4.5617546 ## 18777 4.708774 5.571872 5.3964008 5.1623650 ## 21399 4.785541 4.754537 5.3703795 5.1220551 ## Samples ## Tags 1545540_JMS8-3 1545541_JMS8-4 1545542_JMS8-5 1545544_JMS9-P7c ## 497097 -4.459730 -3.994060 3.2869677 -3.2103696 ## 20671 -1.200995 -1.943434 0.8442767 -0.3228444 ## 27395 4.344401 3.786363 3.8990635 4.3396075 ## 18777 5.649355 5.081611 5.0602470 5.7513694 ## 21399 4.869586 4.943840 5.1384776 5.0308985 ## Samples ## Tags 1545545_JMS9-P8c ## 497097 -5.295316 ## 20671 -1.207853 ## 27395 4.124644 ## 18777 5.142436 ## 21399 4.979644 ## 16619 more rows ... ## ## $weights ## [,1] [,2] [,3] [,4] [,5] [,6] [,7] ## [1,] 1.079413 1.332986 19.826915 20.273253 1.993686 1.395853 20.494977 ## [2,] 1.170357 1.456380 4.804866 8.660025 3.612508 2.626870 8.760149 ## [3,] 20.219073 25.573792 30.434759 28.528310 31.352260 25.743247 28.722497 ## [4,] 26.947557 32.505933 33.583128 33.232125 34.231754 32.354158 33.334340 ## [5,] 26.610864 28.501638 33.645479 33.206374 33.573492 31.996623 33.308490 ## [,8] [,9] ## [1,] 1.107780 1.079413 ## [2,] 3.211473 2.541942 ## [3,] 21.200072 16.657930 ## [4,] 30.348630 24.259801 ## [5,] 25.171513 23.573305 ## 16619 more rows ... ## ## $design ## Basal LP ML laneL006 laneL008 ## 1 0 1 0 0 0 ## 2 0 0 1 0 0 ## 3 1 0 0 0 0 ## 4 1 0 0 1 0 ## 5 0 0 1 1 0 ## 6 0 1 0 1 0 ## 7 1 0 0 1 0 ## 8 0 0 1 0 1 ## 9 0 1 0 0 1 ## attr(,"assign") ## [1] 1 1 1 2 2 ## attr(,"contrasts") ## attr(,"contrasts")$group ## [1] "contr.treatment" ## ## attr(,"contrasts")$lane ## [1] "contr.treatment"
topTableで上位の有意だった遺伝子群を表示?topTableでヘルプ参照coefでコントラスト行列で定義した比較名を指定vfit = lmFit(v, design) vfit = contrasts.fit(vfit, contrasts=contr.matrix) efit = eBayes(vfit)
5%)decideTestsでは比較条件数の多重検定補正を行うtreat()関数とは異なりp値のみから比較条件間の多重補正を行う
treatを用いる方が良いsummary(decideTests(efit))
## BasalvsLP BasalvsML LPvsML ## Down 4648 4927 3135 ## NotSig 7113 7026 10972 ## Up 4863 4671 2517
vennDiagram()関数にdecideTests()関数の結果を入れるdt0 = decideTests(efit) vennDiagram(dt0,circle.col = 1:ncol(dt0),main = "Differential Expressed Genes (LFC >= 0)")
treat()関数
decideTests()よりも厳密な比較lfcよりも十分に大きい変動かを検定するlfc=0にすればデフォルトの最も条件の緩いeBayes()と同じ結果となるtfit = treat(efit,lfc = 1) summary(decideTests(tfit))
## BasalvsLP BasalvsML LPvsML ## Down 1632 1777 224 ## NotSig 12976 12790 16210 ## Up 2016 2057 190
dt = decideTests(tfit) vennDiagram(dt,circle.col = 1:ncol(dt),main="Differential Expressed Genes (LFC>=1)")
decideTestで得られた結果には遺伝子(行)ごとに条件間(列)で有意か否かの情報がある1 (Up), 0 (Not significant), -1 (Down)を表しているdt
## TestResults matrix ## Contrasts ## BasalvsLP BasalvsML LPvsML ## 497097 1 1 0 ## 20671 0 0 0 ## 27395 0 0 0 ## 18777 0 0 0 ## 21399 0 0 0 ## 16619 more rows ...
de.common = which(dt[,1]!=0 & dt[,2]!=0 & dt[,3]!=0) length(de.common)
## [1] 193
head(tfit$genes$SYMBOL[de.common], n=20)
## [1] "Prss39" "Plcl1" "Pigr" "Rnasel" "Glul" "Fmo3" "Esr1" ## [8] "Myb" "Enpp3" "Arg1" "Epb41l2" "Cdk19" "Ccdc162" "Pkib" ## [15] "H2afy2" "Dnajc12" "Reep6" "Apof" "Wap" "Stc2"
topTableで全ての結果をcsv形式で保存write.fit(tfit,dt,file = "limma_result.csv",row.names = F,sep = ",")
topTreat()関数で上位の変動遺伝子を表示するcoef引数にcontr.matrixで定義した条件比較名を指定BasalvsLPの結果numberはいくつまでに結果を取り出すかの引数
Infにすると「全て」head()関数で最初の3行だけ表示しているhead(topTreat(tfit,coef="BasalvsLP",number = Inf),n=3)
## ENTREZID SYMBOL TXCHROM logFC AveExpr t P.Value ## 12759 12759 Clu chr14 -5.455444 8.856581 -33.55508 1.723731e-10 ## 53624 53624 Cldn7 chr11 -5.527356 6.295437 -31.97515 2.576972e-10 ## 242505 242505 Rasef chr4 -5.935249 5.118259 -31.33407 3.081544e-10 ## adj.P.Val ## 12759 1.707586e-06 ## 53624 1.707586e-06 ## 242505 1.707586e-06
plotMD(tfit, column="BasalvsLP", status=dt[,"BasalvsLP"], main="BasalvsLP", xlim=c(-8,13))
volcanoplot(tfit,coef="BasalvsLP",highlight = 30,names = tfit$genes$SYMBOL)
ntop = 100 BasalvsLP = topTreat(tfit,coef="BasalvsLP",number = Inf) topGeneIds = BasalvsLP$ENTREZID[1:ntop] topGenes = BasalvsLP$SYMBOL[1:ntop] topExpr = lcpm[rownames(lcpm) %in% topGeneIds, ]
library(pheatmap)
pheatmap(topExpr,scale = "row",labels_col = group,
labels_row = topGenes,fontsize_row = 5,fontsize_col = 5)
annotation_colあるいはannotation_rowを用いるcolAnno = data.frame(group = group) rownames(colAnno) = colnames(topExpr) colAnno
## group ## 1545535_10_6_5_11 LP ## 1545536_9_6_5_11 ML ## 1545538_purep53 Basal ## 1545539_JMS8-2 Basal ## 1545540_JMS8-3 ML ## 1545541_JMS8-4 LP ## 1545542_JMS8-5 Basal ## 1545544_JMS9-P7c ML ## 1545545_JMS9-P8c LP
pheatmap(topExpr,scale = "row",labels_col = group,
labels_row = topGenes,fontsize_row = 5,fontsize_col = 5,annotation_col = colAnno)
BasalvsLP = topTreat(tfit,coef="BasalvsLP",number = Inf) rnk = BasalvsLP$t names(rnk) = BasalvsLP$ENTREZID rnk = sort(rnk,decreasing = TRUE) rnk[1:10]
## 21804 78896 269132 108153 21345 13497 66871 16665 ## 23.04028 22.92738 21.89868 21.75759 21.50626 21.20271 21.16991 20.98358 ## 108927 170643 ## 20.81574 20.63753
geneListに遺伝子ランクを指定npermにシャッフルの回数を指定(p値は\(\frac{1}{nperm}\)の精度になる)library(clusterProfiler)
gseaResKEGG = gseKEGG(geneList=rnk,organism='mmu',
nPerm=1000,pvalueCutoff=0.05,verbose=F)
head(gseaResKEGG,n=3)
## ID Description setSize enrichmentScore NES ## mmu03040 mmu03040 Spliceosome 129 0.7501444 1.729939 ## mmu04310 mmu04310 Wnt signaling pathway 138 0.6679082 1.554554 ## mmu04110 mmu04110 Cell cycle 119 0.6946123 1.582567 ## pvalue p.adjust qvalues rank leading_edge ## mmu03040 0.001824818 0.02928406 0.0226571 3286 tags=5%, list=20%, signal=4% ## mmu04310 0.001831502 0.02928406 0.0226571 2302 tags=25%, list=14%, signal=22% ## mmu04110 0.001869159 0.02928406 0.0226571 2533 tags=28%, list=15%, signal=24% ## core_enrichment ## mmu03040 15512/67678/53607/110809/545091/66373/76522 ## mmu04310 22409/21414/24117/22411/72293/14160/93960/107515/73181/22420/12006/18018/76441/407821/12444/22402/18163/26564/14369/57265/14365/27412/12325/21415/20377/14735/77583/14371/53627/14362/12323/20317/93840/106042/20671 ## mmu04110 12236/12235/268697/12442/56150/12428/107995/12531/105988/12534/12444/22137/12532/18817/12544/27401/30939/17217/17215/17218/211586/12447/12577/21781/55948/18392/17220/19650/12581/12545/17216/12237/15182